# Read weekly enterovirus case counts for Taipei, 2010-2016, one column per
# year.  (Despite the .csv extension, the file is tab-separated.)
# First column 就診年週 = "year-week of clinic visit" (a week index).
df <- readr::read_tsv("data/EV_TP_10_16.csv")
## Parsed with column specification:
## cols(
## 就診年週 = col_integer(),
## `2010` = col_integer(),
## `2011` = col_integer(),
## `2012` = col_integer(),
## `2013` = col_integer(),
## `2014` = col_integer(),
## `2015` = col_integer(),
## `2016` = col_integer()
## )
# Drop the week-index column, keeping only the seven per-year count columns.
df <- df[-1]
# Drop the last row — presumably a partial week or totals row in the raw
# file; TODO confirm against the source data.
df <- df[-nrow(df), ]
names(df)
## [1] "2010" "2011" "2012" "2013" "2014" "2015" "2016"
head(df)
## # A tibble: 6 x 7
## `2010` `2011` `2012` `2013` `2014` `2015` `2016`
## <int> <int> <int> <int> <int> <int> <int>
## 1 86 57 183 114 135 50 119
## 2 92 55 210 100 106 39 101
## 3 80 51 211 110 79 52 114
## 4 87 45 444 106 82 75 94
## 5 90 73 116 86 140 64 80
## 6 105 49 103 99 95 48 245
# Quick base-graphics overview: one line per year.
# Fixed: the original legend passed names(df) ("2010", "2011", ...) as
# `col`, which does not match matplot's numeric colours 1:7, and used
# `pch` for a line plot.  Draw all series with lty = 1 and give the
# legend the same col/lty so the key actually matches the lines.
matplot(df, type = "l", lty = 1, col = 1:7)
legend("topleft", legend = names(df), col = 1:7, lty = 1)
library(ggplot2)
# One ggplot layer per year (wide-format data).  Fixed: the x axis was a
# hard-coded 1:52; derive it from the number of rows so the plot stays
# correct if the cleaned data has more or fewer weeks.
weeks <- seq_len(nrow(df))
ggplot(df, aes(x = weeks)) +
  geom_line(aes(y = `2010`), color = "grey") +
  geom_line(aes(y = `2011`), color = "black") +
  geom_line(aes(y = `2012`), color = "blue") +
  geom_line(aes(y = `2013`), color = "green") +
  geom_line(aes(y = `2014`), color = "cyan") +
  geom_line(aes(y = `2015`), color = "yellow") +
  geom_line(aes(y = `2016`), color = "red") +
  ylab(label = "Number of infected cases") +
  xlab("Week")
library(tidyverse)
library(ggplot2)
library(dplyr)
# Facebook posts scraped from Tsai Ing-wen's page.
post <- readRDS("data/posts_tsai.rds")
# sum(is.na(post$shares_count)) # 947
# sum(is.na(post$likes_count)) # 0
# sum(is.na(post$comments_count)) # 0
# Restrict to posts created during 2016.  Fixed: the original line had
# stray prose ("hour, month, and week.") fused onto the front of the
# assignment, which is a parse error.
post2016 <- post[post$created_time >= as.POSIXct('2016-01-01'), ]
## Warning in strptime(xx, f <- "%Y-%m-%d %H:%M:%OS", tz = tz): unknown
## timezone 'zone/tz/2018c.1.0/zoneinfo/Asia/Taipei'
post2016 <- post2016[post2016$created_time < as.POSIXct('2017-01-01'),]
# NOTE(review): this resets post2016 back to the FULL data set, discarding
# the date filter above — presumably deliberate for the feature-extraction
# demo that follows, but verify.
post2016 <- post
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.4.3
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Extract time/calendar features with lubridate accessors.
# (The original run emitted "unknown timezone" warnings here — an
# environment issue with the tz database, not a code problem.)
post2016$hour <- hour(post2016$created_time)
# Fixed typo: the original column was spelled `mounth`.
post2016$month <- month(post2016$created_time)
post2016$week <- week(post2016$created_time)
# Fixed bug: the original assigned wday() back into `week`, silently
# clobbering the week-of-year column (the summary below showed
# week Max. = 7, i.e. weekday values).
post2016$wday <- wday(post2016$created_time)
# Inspect the augmented data.  NOTE(review): the transcript below comes
# from the original run — `mounth` is a typo for month, and the `week`
# column was overwritten by wday() (hence Max. = 7); the timezone
# warnings are environment noise from a broken tz database.
summary(post2016)
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2018c.1.0/
## zoneinfo/Asia/Taipei'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2018c.1.0/
## zoneinfo/Asia/Taipei'
## created_time id
## Min. :2008-10-22 21:55:20 Length:4112
## 1st Qu.:2010-11-24 13:59:31 Class :character
## Median :2011-12-28 09:25:28 Mode :character
## Mean :2012-08-26 13:05:56
## 3rd Qu.:2013-12-25 17:06:44
## Max. :2017-01-28 13:10:00
##
## updated_time type message
## Min. :2008-11-13 13:19:43 Length:4112 Length:4112
## 1st Qu.:2011-12-08 10:29:49 Class :character Class :character
## Median :2012-01-01 14:07:49 Mode :character Mode :character
## Mean :2012-12-08 21:47:22
## 3rd Qu.:2014-07-27 13:21:22
## Max. :2017-05-31 00:09:30
##
## shares_count likes_count comments_count hour
## Min. : 1.0 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 11.5 1st Qu.: 82.8 1st Qu.: 3.0 1st Qu.:10.00
## Median : 227.0 Median : 394.0 Median : 39.0 Median :14.00
## Mean : 386.1 Mean : 12955.1 Mean : 360.9 Mean :13.14
## 3rd Qu.: 442.0 3rd Qu.: 19995.0 3rd Qu.: 395.2 3rd Qu.:17.00
## Max. :12015.0 Max. :358502.0 Max. :38298.0 Max. :23.00
## NA's :1565
## mounth week
## Min. : 1.000 Min. :1.000
## 1st Qu.: 6.000 1st Qu.:3.000
## Median : 9.000 Median :4.000
## Mean : 8.373 Mean :4.196
## 3rd Qu.:12.000 3rd Qu.:6.000
## Max. :12.000 Max. :7.000
##
# Same derivation, dplyr-style: keep only 2016 posts, then add the
# hour / month / week-of-year columns.  filter() accepts several
# conditions at once, and one mutate() can create all three columns.
post2016 <- filter(post,
                   created_time > as.POSIXlt('2016-01-01'),
                   created_time < as.POSIXlt('2017-01-01'))
post2016 <- mutate(post2016,
                   hour  = hour(created_time),
                   month = month(created_time),
                   week  = week(created_time))
# Pipe version of the same derivation.  Fixed bug: the original mutate()
# calls referenced post2016$created_time — a column of the PREVIOUS
# post2016 object, not the data flowing through the pipe.  That only
# "worked" because the previous step happened to produce an identically
# filtered frame; the bare column name is the correct form.
post2016 <- post %>%
  filter(created_time > as.POSIXlt('2016-01-01')) %>%
  filter(created_time < as.POSIXlt('2017-01-01')) %>%
  mutate(hour = hour(created_time)) %>%
  mutate(month = month(created_time)) %>%
  mutate(week = week(created_time))
summary(post2016)
## created_time id
## Min. :2016-01-01 12:00:02 Length:489
## 1st Qu.:2016-02-06 05:58:53 Class :character
## Median :2016-05-10 17:33:50 Mode :character
## Mean :2016-05-21 23:50:35
## 3rd Qu.:2016-08-20 10:20:51
## Max. :2016-12-31 14:18:02
##
## updated_time type message
## Min. :2016-01-01 16:00:01 Length:489 Length:489
## 1st Qu.:2016-05-12 01:10:41 Class :character Class :character
## Median :2016-08-17 14:32:46 Mode :character Mode :character
## Mean :2016-08-30 22:01:45
## 3rd Qu.:2017-01-23 18:04:58
## Max. :2017-05-31 00:09:30
##
## shares_count likes_count comments_count hour
## Min. : 1.0 Min. : 278 Min. : 12 Min. : 0.00
## 1st Qu.: 314.0 1st Qu.: 19208 1st Qu.: 699 1st Qu.:12.00
## Median : 471.5 Median : 28359 Median : 973 Median :15.00
## Mean : 807.9 Mean : 38467 Mean : 1572 Mean :14.85
## 3rd Qu.: 817.5 3rd Qu.: 43034 3rd Qu.: 1493 3rd Qu.:18.00
## Max. :8613.0 Max. :358502 Max. :38298 Max. :23.00
## NA's :11
## month week
## Min. : 1.000 Min. : 1.00
## 1st Qu.: 2.000 1st Qu.: 6.00
## Median : 5.000 Median :19.00
## Mean : 5.157 Mean :20.78
## 3rd Qu.: 8.000 3rd Qu.:34.00
## Max. :12.000 Max. :53.00
##
# Derive calendar/time features for the FULL posting history (the 2016
# filter is deliberately left commented out so every year is kept).
# post2016 <- post %>%
#   filter(created_time > as.POSIXlt('2016-01-01'),
#          created_time < as.POSIXlt('2017-01-01'))
post2016 <- mutate(post,
                   hour  = hour(created_time),
                   month = month(created_time),
                   week  = week(created_time),
                   wday  = wday(created_time),
                   year  = year(created_time))
# (The original run emitted repeated "unknown timezone" warnings here —
# a broken tz database in that environment, not a code problem.)
# Inspect the full history with its derived columns; note created_time
# now spans 2008-2017 and week runs to 53 (week-of-year, not weekday).
summary(post2016)
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2018c.1.0/
## zoneinfo/Asia/Taipei'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2018c.1.0/
## zoneinfo/Asia/Taipei'
## created_time id
## Min. :2008-10-22 21:55:20 Length:4112
## 1st Qu.:2010-11-24 13:59:31 Class :character
## Median :2011-12-28 09:25:28 Mode :character
## Mean :2012-08-26 13:05:56
## 3rd Qu.:2013-12-25 17:06:44
## Max. :2017-01-28 13:10:00
##
## updated_time type message
## Min. :2008-11-13 13:19:43 Length:4112 Length:4112
## 1st Qu.:2011-12-08 10:29:49 Class :character Class :character
## Median :2012-01-01 14:07:49 Mode :character Mode :character
## Mean :2012-12-08 21:47:22
## 3rd Qu.:2014-07-27 13:21:22
## Max. :2017-05-31 00:09:30
##
## shares_count likes_count comments_count hour
## Min. : 1.0 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 11.5 1st Qu.: 82.8 1st Qu.: 3.0 1st Qu.:10.00
## Median : 227.0 Median : 394.0 Median : 39.0 Median :14.00
## Mean : 386.1 Mean : 12955.1 Mean : 360.9 Mean :13.14
## 3rd Qu.: 442.0 3rd Qu.: 19995.0 3rd Qu.: 395.2 3rd Qu.:17.00
## Max. :12015.0 Max. :358502.0 Max. :38298.0 Max. :23.00
## NA's :1565
## month week wday year
## Min. : 1.000 Min. : 1.0 Min. :1.000 Min. :2008
## 1st Qu.: 6.000 1st Qu.:23.0 1st Qu.:3.000 1st Qu.:2010
## Median : 9.000 Median :39.0 Median :4.000 Median :2011
## Mean : 8.373 Mean :34.8 Mean :4.196 Mean :2012
## 3rd Qu.:12.000 3rd Qu.:50.0 3rd Qu.:6.000 3rd Qu.:2013
## Max. :12.000 Max. :53.0 Max. :7.000 Max. :2017
##
# Per-hour engagement summaries, computed three ways for comparison.
likes_byhour <- with(post2016, tapply(likes_count, hour, mean))
comments_byhour <- with(post2016, tapply(comments_count, hour, mean))
# Base-R aggregate() demos; each assignment overwrites the previous one.
# NOTE(review): the name `summary` shadows base::summary() — rename if
# this script grows; kept here to preserve the transcript.
summary <- aggregate(cbind(likes_count, comments_count, shares_count) ~ hour,
                     data = post2016, mean, na.rm = TRUE)
summary <- aggregate(cbind(likes_count, comments_count, shares_count) ~ hour + week,
                     data = post2016, sum, na.rm = TRUE)
# dplyr version: count plus mean / sum / sd of each engagement metric per
# hour.  Fixed: spelled out TRUE instead of the reassignable shortcut T.
summary <- post2016 %>%
  group_by(hour) %>%
  summarize(
    n = n(),
    mean_comment = mean(comments_count, na.rm = TRUE),
    mean_share = mean(shares_count, na.rm = TRUE),
    mean_like = mean(likes_count, na.rm = TRUE),
    sum_comment = sum(comments_count, na.rm = TRUE),
    sum_share = sum(shares_count, na.rm = TRUE),
    sum_like = sum(likes_count, na.rm = TRUE),
    sd_comment = sd(comments_count, na.rm = TRUE),
    sd_share = sd(shares_count, na.rm = TRUE),
    sd_like = sd(likes_count, na.rm = TRUE)
  )
library(tidyr)
# Hour x weekday post counts, padded so every hour 0-23 appears even if
# no post fell in it.  Step-by-step version:
t1 <- count(post2016, hour, wday)
t2 <- spread(t1, wday, n, fill = 0)
# Explicit `by` documents the key and silences the "Joining, by" message.
t3 <- right_join(t2, data.frame(hour = 0:23), by = "hour")
t3[is.na(t3)] <- 0
# Same computation as one pipe.  Fixed: mutate_all(funs(...)) is defunct
# in current dplyr; after gather() the only column that can hold NAs
# (from hours padded in by the right join) is `n`, so zero-fill it
# directly.
myt <- post2016 %>%
  count(hour, wday) %>%
  spread(wday, n, fill = 0) %>%
  right_join(data.frame(hour = 0:23), by = "hour") %>%
  gather(wday, n, -hour)
myt$n[is.na(myt$n)] <- 0
# Log message length vs posting hour.  Fixed: log(nchar(message)) is -Inf
# for zero-length messages; treat both missing and empty messages as 0.
post2016 <- post2016 %>%
  mutate(nchar = ifelse(is.na(message) | nchar(message) == 0,
                        0, log(nchar(message))))
plot(post2016$nchar, post2016$hour, xlab='nchar', ylab='time(hour)')
# Scatter plot with a fitted regression line.
# x, y: numeric vectors; xlab/ylab: axis labels; ylim: upper y limit
# (caps the axis so extreme outliers don't flatten the plot).
# pch = 1: open-circle point symbol.
myScatter <- function(x, y, xlab, ylab, ylim){
  # Fixed: y axis now starts at 0 (was 1), so zero-count posts are shown.
  plot(y ~ x, xlab = xlab, ylab = ylab, pch = 1, ylim = c(0, ylim))
  abline(lm(y ~ x), col = "red") # regression line (y ~ x)
}
par(mfrow = c(3, 1), mai = c(0.3, 0.3, 0.3, 0.3))
# Fixed: the original calls swapped the axis labels — x is the hour,
# y is the engagement count.
with(post2016, myScatter(hour, comments_count, 'hour', 'comments', 5000))
with(post2016, myScatter(hour, shares_count, 'hour', 'shares', 5000))
with(post2016, myScatter(hour, likes_count, 'hour', 'likes', 100000))
# Cluster posts by raw likes/comments counts.
selected <- post2016 %>%
  select(likes_count, comments_count) %>%
  na.omit()
# k-means cluster.  Fixed: kmeans() starts from random centers, so seed
# the RNG to make the cluster assignment reproducible.
set.seed(1)
cres <- kmeans(selected, 3)
# cres$centers
# cres$cluster
colors <- c("#FF0000", "#00FF00", "#0000FF")
selected$color <- colors[cres$cluster]
# NOTE(review): counts of 0 map to -Inf under log(); such points are
# silently dropped from the plot — verify that is acceptable.
plot(log(selected$likes_count), log(selected$comments_count), col=selected$color)
library(ggplot2)
# Jittered scatter of comments vs likes, coloured by post type.
# Fixed: the original had the qplot() call and the palette assignment
# fused onto a single line, which is a parse error.
qplot(comments_count, likes_count, color = type, data = post2016,
      geom = "jitter", alpha = I(0.2))
# Colour-blind-friendly palette (Okabe-Ito).
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
post2016 %>%
  ggplot() +
  aes(log(likes_count), year, color = factor(type)) +
  geom_point(alpha = 0.5) +
  geom_jitter() +
  scale_colour_manual(values=cbPalette)
# The commented pipe below derives the features on the fly; post2016
# already carries year/month from the earlier mutate, so plot directly.
# post2016 %>%
# mutate(hour = hour(created_time),
# month = month(created_time),
# week = week(created_time),
# wday = wday(created_time),
# year = year(created_time)) %>%
# ggplot() +
# aes(year, month) +
# geom_point(alpha = 0.5)
# One point per (year, month) with posts; alpha shows overplot density.
post2016 %>%
ggplot() +
aes(year, month) +
geom_point(alpha = 0.5)
# Number of posts per year.  Fixed: geom_bar() has no `bin_width`
# parameter (the original run warned "Ignoring unknown parameters:
# bin_width"); geom_bar counts cases per discrete x value, so no bin
# width is needed.
post2016 %>%
  ggplot(aes(year)) +
  geom_bar()
# geom_bar() uses stat_count by default: it counts the number of cases at
# each x position.  geom_col() uses stat_identity: it leaves the data as
# is.  Fixed: this explanation was fused onto the code line in the
# original, producing a parse error.
post2016 %>%
  ggplot() +
  aes(hour) +
  geom_bar(fill = "red")
# Pre-counted bars, two equivalent spellings:
# geom_bar(stat = "identity") draws heights straight from `n` ...
post2016 %>%
count(hour) %>%
ggplot() +
aes(hour, n) +
geom_bar(stat = "identity", fill = "red") +
geom_smooth()
## `geom_smooth()` using method = 'loess'
# ... and geom_col() is the modern shorthand for exactly that.
post2016 %>%
count(hour) %>%
ggplot() +
aes(hour, n) +
geom_col(fill = "red") +
geom_smooth()
## `geom_smooth()` using method = 'loess'
# Message length (characters, unlogged here) vs hour, one facet row per
# post type, with a linear trend per type.
post2016 %>%
mutate(nchar = ifelse(is.na(message), 0, nchar(message))) %>%
ggplot(aes(hour, nchar, color=factor(type))) +
geom_point(alpha = 0.1) +
facet_grid(type ~ .) +
geom_smooth(method = "lm") +
geom_jitter() +
scale_colour_brewer(palette="Spectral")
# Shares over week-of-year, faceted by type; only the second type level
# is highlighted in red, the rest are greyed out.
post2016 %>%
ggplot(aes(week, shares_count, color=factor(type))) +
geom_point(alpha = 0.2) +
facet_grid(. ~ type) +
geom_smooth() +
geom_jitter() +
scale_colour_manual(values = c("gray", "red", "gray", "gray", "gray", "gray"))
## `geom_smooth()` using method = 'gam'
# The removed rows below are the 1565 posts with missing shares_count.
## Warning: Removed 1565 rows containing non-finite values (stat_smooth).
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Warning: Removed 1565 rows containing missing values (geom_point).
## Warning: Removed 1565 rows containing missing values (geom_point).